#!/usr/bin/python
# Filename: replace_class.py
# Replaces rare (low-frequency) words in the training file with word-class placeholder tokens
import re;

# Word-class patterns, compiled once at import time instead of on every call.
_NUM_RE = re.compile('^[0-9]+$')
_ALLCAP_RE = re.compile('^[A-Z]{2,}$')
_FIRST_CAP_RE = re.compile('^[A-Z]')


def Replace(word):
	"""Return the placeholder token for the class that `word` belongs to.

	Classes are tested in this order:

	* ``_NUM_``           -- the word consists only of the digits 0-9.
	* ``_ALLCAP_``        -- the word consists only of two or more letters A-Z.
	* ``_FIRST_CAPITAL_`` -- the word begins with a letter A-Z (this includes
	  a single capital letter such as "A").
	* ``_RARE_``          -- anything else, including the empty string.

	The all-capitals test must come before the first-capital test, because
	every all-capitals word also begins with a capital letter.
	"""
	if _NUM_RE.match(word):
		return '_NUM_'
	if _ALLCAP_RE.match(word):
		return '_ALLCAP_'
	if _FIRST_CAP_RE.match(word):
		return '_FIRST_CAPITAL_'
	return '_RARE_'
		
		

# A word counts as rare when it occurs this many times or fewer in the input.
RARE_THRESHOLD = 5

# Maps each word (the first space-separated field of a line) to its frequency.
words_count = dict()

# open() works on both Python 2 and 3, and the with-blocks close both files
# even if an error occurs part-way through.
with open('ner_train.dat') as fr:
	# Pass 1: count the words. The key is extracted exactly as in pass 2, so
	# every lookup made there is guaranteed to find an entry.
	for line in fr:
		token = line.strip('\n').split(' ')[0]
		words_count[token] = words_count.get(token, 0) + 1

	# Pass 2: re-read the input from the start and write the output file,
	# replacing each rare word with its class placeholder.
	fr.seek(0)
	with open('ner_train_rep.dat', 'w') as fw:
		for line in fr:
			word = line.strip('\n').split(' ')
			# Lines without a second field (e.g. blank separator lines) have
			# nothing to rebuild, so they are copied through unchanged.
			if len(word) < 2 or words_count[word[0]] > RARE_THRESHOLD:
				fw.write(line)
				continue
			replacement = Replace(word[0])
			# Report each replaced word and its placeholder on stdout.
			print(word[0])
			print(replacement)
			# Only the first two fields are kept on a rewritten line.
			fw.write(replacement + ' ' + word[1] + '\n')
		
		
